In [ ]:
# libraries of code that we use to abstract away the training process
!pip install -U transformers datasets peft bitsandbytes

training the model

In [2]:
# import those libraries for use
from datasets import load_dataset
from transformers import (
    DataCollatorForLanguageModeling,
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
)
from peft import LoraConfig, get_peft_model
import torch
In [3]:
# config
# the base model you'll use, the dataset you'll use,
# where to save the model afterward in the file system
model_id = "Qwen/Qwen2.5-3B"
dataset_id = "Ozziey/poems_dataset"
output_dir = "./qwen3b-lora-finetuned"

# since the models are large, we can opt to finetune small 'adapters' attached
# to them instead of all of the weights - this saves memory and leaves room
# for longer texts. these are the 'rank' (think: size / capacity) and
# 'alpha' (think: strength of influence on the model)
lora_r = 2
lora_alpha = 32

# these are specific components inside the model we're going to add adapters to
lora_target_modules = [
    # inside the 'attention' layers
    "q_proj", "k_proj", "v_proj", "o_proj",
    # inside the 'mlp' or 'fully connected' layers
    # "up_proj", "gate_proj", "down_proj"
]

# load the model we're going to adapt
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    attn_implementation="sdpa",
)
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

# set the adapter configuration, and add them to the model
peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    target_modules=lora_target_modules,
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, peft_config).cuda()
/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning: 
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  warnings.warn(
`torch_dtype` is deprecated! Use `dtype` instead!
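as a quick sanity check, a peft model can tell you how few parameters the adapters actually add (optional extra cell):

model.print_trainable_parameters()

with rank 2 on just the attention projections this should report well under 1% of the full model as trainable.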
In [4]:
# prepare dataset
dataset = load_dataset(dataset_id, split="train").shuffle()

def format_fn(example):
    return {'text': example['poem content']}

dataset = dataset.map(format_fn, remove_columns=dataset.column_names)

# convert the words into word ids ('tokens') the model can understand
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True, padding=False)

tokenized_dataset = dataset.map(tokenize_function,
                                remove_columns=dataset.column_names)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)
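a quick way to check that the formatting and tokenization worked (optional; the column names assume the dataset above):

print(dataset[0]['text'][:300])
print(len(tokenized_dataset[0]['input_ids']), 'tokens in the first example')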

if you have your own dataset, and it's in a .csv file, you can:

dataset = load_dataset("csv", data_files = ["my_data.csv"]).shuffle()

or another example i see from people is having a folder full of text files. you can access this machine's file system by clicking the folder icon on the left side of the screen: create a folder there, click it to open it, and drag all of your files into it

dataset = load_dataset("text", data_dir="my_data_folder").shuffle()
In [5]:
# split the dataset into train and test
# so we can see if it's only memorizing the train data or not
split_datasets = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split_datasets["train"]
val_dataset = split_datasets["test"]
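you can double-check how many examples ended up in each split:

print(len(train_dataset), 'train examples /', len(val_dataset), 'validation examples')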
In [6]:
# set training configuration
training_args = TrainingArguments(
    output_dir=output_dir,
    # these two specify how many texts to process at once &
    # how many of those batches to accumulate before each update to the model
    # (so the effective batch size here is 2 * 4 = 8)
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,

    # how fast you train the model, you have to be gentle here
    learning_rate=3e-4,
    # what percentage of the steps will be spent slowly turning the learning
    # rate from 0 to your specified value
    warmup_ratio=0.1,

    # the specific optimization algorithm used, 'adamw_torch' is standard
    # you can use 'paged_adamw_8bit' to save some memory
    optim="paged_adamw_8bit",

    # how many times to loop over your data during training (too many will have
    # the model memorize the data, 5 might even be too much... just try to have
    # lots of data, decrease the adapter rank if you get memorization)
    num_train_epochs=4,

    # will show you the performance every `n` steps, save the model every `n`
    # (just in case)
    logging_steps=32,
    save_steps=128,
    load_best_model_at_end=True,

    # how often to run the model over the test set to see if it generalizes
    # and isn't purely memorizing / 'over-fitting' on the train set
    eval_strategy="steps",
    eval_steps=32,
    # <- this library does this dumb ass thing where eval_steps takes precedent
    # over logging_steps and it wont log anything to console at logging_steps
    # only at eval_steps

    # our data-type, bf16 is fast and uses 16 bits, another common choice is
    # fp32 for higher precision but slower training, taking up more VRAM
    bf16=True,

    # this can be set to `wandb` if you have an account on wandb.ai
    # it's great: it shows you graphs that you can check on your phone while
    # you step away from your computer
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# train and save. you'll want the loss to trend downwards over time.
# for narrow (task-specific) datasets, a rule of thumb would be:
# 3 is eh, a little over 2 is pretty good, close to 1 is memorizing

# for more general datasets with wide range of types of docs
# a good loss will be a little bit higher

# it's really dependent on your task type(s)

# this can take like an hour; make sure it's not gonna take >3hr (reduce epochs
# if so), since that's around the limit for google colab's free runtime.
# you can check the time you have left by clicking the 'RAM/Disk' graph at the
# top right: "At your current usage level, this runtime may last up to ..."
trainer.train()
trainer.save_model(output_dir)
/tmp/ipython-input-1267283505.py:48: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Trainer.__init__`. Use `processing_class` instead.
  trainer = Trainer(
The model is already on multiple devices. Skipping the move to device specified in `args`.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.
[204/204 43:49, Epoch 4/4]
Step Training Loss Validation Loss
32 3.470600 3.215731
64 3.288400 3.199121
96 3.302300 3.206150
128 3.216800 3.228344
160 3.146000 3.239932
192 3.123500 3.262610
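
if you set report_to="none" like above, you can still eyeball the loss curves after training: everything the trainer logged is kept in trainer.state.log_history. a small optional sketch using matplotlib (already installed on colab):

import matplotlib.pyplot as plt

history = trainer.state.log_history
train_points = [(h["step"], h["loss"]) for h in history if "loss" in h]
eval_points = [(h["step"], h["eval_loss"]) for h in history if "eval_loss" in h]

plt.plot([s for s, _ in train_points], [l for _, l in train_points], label="training loss")
plt.plot([s for s, _ in eval_points], [l for _, l in eval_points], label="validation loss")
plt.xlabel("step")
plt.ylabel("loss")
plt.legend()
plt.show()

in the run above, the validation loss bottoms out around step 64 and then creeps back up, which is the mild over-fitting the comments earlier warned about.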

test out the trained model

In [7]:
prompt = "None"

inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
outputs = model.generate(
    **inputs, do_sample = True,
    temperature = 1.0, min_p = 0.1,
    max_new_tokens = 256
)
print(tokenizer.decode(outputs[0]))
None, never mind me,
I've got a little bottle of this, of that.
I've got a little bottle of everything,
and a little spoon,
and a little tin box for the medicine.
I've got a little bottle of everything.

But what's the good of me,
and what's the good of you?
You're only one,
and I'm only one.
The whole world's only one,
and that's the whole world's only one.

Oh, that's enough for me,
and that's enough for you.
I've got a little bottle of this, of that.
I've got a little bottle of everything,
and a little spoon,
and a little tin box for the medicine.
I've got a little bottle of everything.

I've got a little bottle of everything,
and a little spoon,
and a little tin box for the medicine.
I've got a little bottle of everything.
None, never mind me,
I've got a little bottle of this, of that.



1924.<|endoftext|>
# if you have a huggingface account, you can save the model to the account with
model.push_to_hub("username/model_name")
tokenizer.push_to_hub("username/model_name")

# and then later on to use the model all you'll have to do is
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model = AutoModelForCausalLM.from_pretrained(
    "username/model_name",
    torch_dtype=torch.bfloat16,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained("username/model_name")

# and the generating code from before
prompt = "Once upon a time,"
inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
outputs = model.generate(
    **inputs, do_sample = True,
    temperature = 1.0, min_p = 0.1,
    max_new_tokens = 256
)
print(tokenizer.decode(outputs[0]))

or if you'd like to use your model with services like LM Studio / other Llama.cpp platforms, follow the instructions here to convert your model into a .gguf file: https://www.geeksforgeeks.org/machine-learning/how-to-convert-any-huggingface-model-to-gguf-file-format/

(actually, you'll have to first run some code to merge the adapters into the model:

model = model.merge_and_unload()
model.push_to_hub("username/model_name_merged")
tokenizer.push_to_hub("username/model_name_merged")

and convert that merged model)
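
if you'd rather keep the merged model on disk instead of (or before) pushing it to the hub, the same idea works with save_pretrained (the folder name here is just an example):

merged = model.merge_and_unload()
merged.save_pretrained("./qwen3b-merged")
tokenizer.save_pretrained("./qwen3b-merged")

then point the gguf conversion script at that folder.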